R packages to be used.

if (!require('dplyr')) install.packages('dplyr'); library('dplyr')
if (!require('tidyr')) install.packages('tidyr'); library('tidyr')
if (!require('lubridate')) install.packages('lubridate'); library('lubridate')
if (!require('tidyquant')) install.packages('tidyquant'); library('tidyquant')
if (!require('plotly')) install.packages('plotly'); library('plotly')
# Training
if (!require('caret')) install.packages('caret'); library('caret')
# Time Series (ARIMA)
if (!require('forecast')) install.packages('forecast'); library('forecast')
# Moving Average
if (!require('zoo')) install.packages('zoo'); library('zoo')

Data Import

df_country <- read.csv("./cases_malaysia.csv", header=TRUE)
# Convert date column & sort
df_country$date <- as.Date(df_country$date, format="%Y-%m-%d")
df_country <- df_country[order(df_country$date),]
# Filter data
df_data <- df_country[,c("date", "cases_new", "cases_active")]

Linear train

split_ratio <- 0.7
set.seed(168)
split_index <- createDataPartition(df_data$cases_new, p=split_ratio, list=FALSE)
data_train <- df_data[split_index,]
data_test <- df_data[-split_index,]

linear_model <- lm(cases_new~cases_active,data=data_train)
summary(linear_model)
## 
## Call:
## lm(formula = cases_new ~ cases_active, data = data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3742.5  -215.4  -127.7   221.8  4264.9 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.286e+02  5.175e+01   2.485   0.0133 *  
## cases_active 8.244e-02  6.365e-04 129.506   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 948.8 on 494 degrees of freedom
## Multiple R-squared:  0.9714, Adjusted R-squared:  0.9713 
## F-statistic: 1.677e+04 on 1 and 494 DF,  p-value: < 2.2e-16
plot(linear_model)

linear_prediction <- linear_model %>% predict(data_test)
linear_compare <- data.frame(actual=data_test$cases_new, predicted=linear_prediction)
head(linear_compare)
##    actual predicted
## 1       4  128.9024
## 5       3  129.1497
## 8       0  129.2322
## 10      0  129.2322
## 14      1  129.7268
## 16      1  129.7268
linear_performance <- data.frame(
  MODEL = "Gaussian Linear",
  R2 = R2(linear_prediction, data_test$cases_new),
  RMSE = RMSE(linear_prediction, data_test$cases_new),
  MAE = MAE(linear_prediction, data_test$cases_new)
)
linear_performance
##             MODEL        R2     RMSE      MAE
## 1 Gaussian Linear 0.9721543 906.3865 545.7654
# Chart init
df_predicted <- data.frame(date=data_test$date, cases_new=linear_prediction)
df_actual <- data_test
df_train <- data_train

lm_chart <- plot_ly()
# Predicted Data
lm_chart <- lm_chart %>% 
  add_trace(
    x = df_predicted[["date"]], y = df_predicted[["cases_new"]],
    name = "Predicted Data",
    type = 'scatter',
    mode = 'lines',
    line = list(color = 'red', width = 3)
  )
# Test Data
lm_chart <- lm_chart %>% 
  add_trace(
    x = df_actual[["date"]], y = df_actual[["cases_new"]],
    name = "Actual Data",
    type = 'scatter',
    mode = 'lines',
    line = list(color = 'skyblue', width = 3)
  )

lm_chart <- lm_chart %>% 
  add_trace(
    x = df_train[["date"]], y = df_train[["cases_new"]], 
    name = "Train Data",
    type = "scatter",
    mode = "lines",
    line = list(color = 'green', width = 2)
  )

# Set figure title, x and y-axes titles
lm_chart <- lm_chart %>% layout(
  title = "Linear Regression of Daily New Cases",
  xaxis = list(title="Recorded Time"),
  yaxis = list(title="Daily Count of New Cases")
)%>%
  layout(plot_bgcolor='#e5ecf6',
          xaxis = list(
            zerolinecolor = '#ffff',
            zerolinewidth = 2,
            gridcolor = 'ffff'),
          yaxis = list(
            zerolinecolor = '#ffff',
            zerolinewidth = 2,
            gridcolor = 'ffff')
          )


lm_chart

Predict future new cases Using Predicted future active cases from ARIMA Model

init_year <- format(as.Date(df_data[1,1], format="%Y-%m-%d"),"%Y")
init_day <- yday(as.Date(df_data[1,1], format="%Y-%m-%d"))
data_arima <- ts(df_data$cases_active, start=c(init_year,init_day), frequency=365)
head(data_arima)
## Time Series:
## Start = c(2020, 25) 
## End = c(2020, 30) 
## Frequency = 365 
## [1] 4 4 4 4 7 8

ARIMA Training

arima_model <- auto.arima(df_data$cases_active, trace = TRUE, ic = 'aicc', approximation = FALSE, stepwise = FALSE)
## 
##  ARIMA(0,1,0)                    : 12574.82
##  ARIMA(0,1,0) with drift         : 12576.06
##  ARIMA(0,1,1)                    : 12201.06
##  ARIMA(0,1,1) with drift         : 12202.55
##  ARIMA(0,1,2)                    : 12000.65
##  ARIMA(0,1,2) with drift         : 12002.3
##  ARIMA(0,1,3)                    : 11949.82
##  ARIMA(0,1,3) with drift         : 11951.54
##  ARIMA(0,1,4)                    : 11892.69
##  ARIMA(0,1,4) with drift         : 11894.47
##  ARIMA(0,1,5)                    : 11833.57
##  ARIMA(0,1,5) with drift         : 11835.4
##  ARIMA(1,1,0)                    : 11789.02
##  ARIMA(1,1,0) with drift         : 11790.96
##  ARIMA(1,1,1)                    : 11637.72
##  ARIMA(1,1,1) with drift         : 11639.74
##  ARIMA(1,1,2)                    : 11636.15
##  ARIMA(1,1,2) with drift         : 11638.18
##  ARIMA(1,1,3)                    : 11637.46
##  ARIMA(1,1,3) with drift         : 11639.49
##  ARIMA(1,1,4)                    : 11612.64
##  ARIMA(1,1,4) with drift         : 11614.67
##  ARIMA(2,1,0)                    : 11710.42
##  ARIMA(2,1,0) with drift         : 11712.4
##  ARIMA(2,1,1)                    : 11636.58
##  ARIMA(2,1,1) with drift         : 11638.61
##  ARIMA(2,1,2)                    : 11638.07
##  ARIMA(2,1,2) with drift         : 11640.11
##  ARIMA(2,1,3)                    : Inf
##  ARIMA(2,1,3) with drift         : 11640.24
##  ARIMA(3,1,0)                    : 11684.41
##  ARIMA(3,1,0) with drift         : 11686.42
##  ARIMA(3,1,1)                    : 11636.13
##  ARIMA(3,1,1) with drift         : 11638.16
##  ARIMA(3,1,2)                    : Inf
##  ARIMA(3,1,2) with drift         : Inf
##  ARIMA(4,1,0)                    : 11635.54
##  ARIMA(4,1,0) with drift         : 11637.56
##  ARIMA(4,1,1)                    : 11623.02
##  ARIMA(4,1,1) with drift         : 11625.06
##  ARIMA(5,1,0)                    : 11626.8
##  ARIMA(5,1,0) with drift         : 11628.84
## 
## 
## 
##  Best model: ARIMA(1,1,4)
arima_model
## Series: df_data$cases_active 
## ARIMA(1,1,4) 
## 
## Coefficients:
##          ar1      ma1      ma2      ma3     ma4
##       0.9737  -0.6159  -0.1013  -0.0863  0.2176
## s.e.  0.0090   0.0383   0.0443   0.0453  0.0415
## 
## sigma^2 estimated as 785818:  log likelihood=-5800.26
## AIC=11612.52   AICc=11612.64   BIC=11639.89
forecast_length <- 30
arima_predict <- forecast(arima_model, forecast_length)
head(arima_predict)
## $method
## [1] "ARIMA(1,1,4)"
## 
## $model
## Series: df_data$cases_active 
## ARIMA(1,1,4) 
## 
## Coefficients:
##          ar1      ma1      ma2      ma3     ma4
##       0.9737  -0.6159  -0.1013  -0.0863  0.2176
## s.e.  0.0090   0.0383   0.0443   0.0453  0.0415
## 
## sigma^2 estimated as 785818:  log likelihood=-5800.26
## AIC=11612.52   AICc=11612.64   BIC=11639.89
## 
## $level
## [1] 80 95
## 
## $mean
## Time Series:
## Start = 709 
## End = 738 
## Frequency = 1 
##  [1] 40614.16 40307.63 40048.57 39882.01 39719.83 39561.92 39408.15 39258.42
##  [9] 39112.62 38970.65 38832.42 38697.81 38566.74 38439.11 38314.84 38193.83
## [17] 38076.00 37961.26 37849.54 37740.75 37634.82 37531.67 37431.23 37333.44
## [25] 37238.21 37145.48 37055.18 36967.26 36881.65 36798.29
## 
## $lower
## Time Series:
## Start = 709 
## End = 738 
## Frequency = 1 
##           80%         95%
## 709 39478.114  38876.7256
## 710 38391.848  37377.6936
## 711 37403.746  36003.6634
## 712 36566.876  34811.9487
## 713 35617.312  33445.5671
## 714 34581.553  31945.1080
## 715 33477.514  30338.0259
## 716 32317.611  28643.3705
## 717 31110.863  26874.9883
## 718 29864.124  25043.4190
## 719 28582.818  23157.0088
## 720 27271.371  21222.5802
## 721 25933.490  19245.8519
## 722 24572.339  17231.7126
## 723 23190.662  15184.4066
## 724 21790.866  13107.6638
## 725 20375.089  11004.7946
## 726 18945.235   8878.7600
## 727 17503.021   6732.2261
## 728 16049.997   4567.6059
## 729 14587.570   2387.0935
## 730 13117.025    192.6909
## 731 11639.533  -2013.7690
## 732 10156.171  -4230.6039
## 733  8667.926  -6456.2660
## 734  7175.708  -8689.3286
## 735  5680.356 -10928.4742
## 736  4182.644 -13172.4849
## 737  2683.288 -15420.2328
## 738  1182.949 -17670.6729
## 
## $upper
## Time Series:
## Start = 709 
## End = 738 
## Frequency = 1 
##          80%      95%
## 709 41750.21 42351.60
## 710 42223.42 43237.57
## 711 42693.38 44093.47
## 712 43197.15 44952.08
## 713 43822.36 45994.10
## 714 44542.28 47178.73
## 715 45338.78 48478.27
## 716 46199.22 49873.46
## 717 47114.38 51350.25
## 718 48077.18 52897.89
## 719 49082.01 54507.82
## 720 50124.25 56173.04
## 721 51199.99 57887.63
## 722 52305.88 59646.51
## 723 53439.01 61445.27
## 724 54596.79 63279.99
## 725 55776.90 65147.20
## 726 56977.28 67043.76
## 727 58196.05 68966.85
## 728 59431.50 70913.89
## 729 60682.07 72882.55
## 730 61946.32 74870.65
## 731 63222.94 76876.24
## 732 64510.70 78897.48
## 733 65808.48 80932.68
## 734 67115.25 82980.28
## 735 68430.01 85038.84
## 736 69751.88 87107.01
## 737 71080.02 89183.54
## 738 72413.63 91267.25
plot(arima_predict, main = "Predicted Active Cases", col.main = "black")

last_date <- as.Date(df_data[(nrow(df_data)):nrow(df_data),1], format="%Y-%m-%d")
last_date <- last_date + 1
df_arima <- data.frame(
  date=seq(last_date, by = "day", length.out = forecast_length),
  cases_active=arima_predict$mean
)

combined_prediction <- linear_model %>% predict(df_arima)
df_combined_predicted <- data.frame(date=df_arima$date, cases_new=combined_prediction)
df_data$month <- strftime(df_data$date, "%m")
df_data$year <- strftime(df_data$date, "%Y")

df_smooth <- df_data %>%                         
  group_by(date=lubridate::floor_date(df_data$date, "month")) %>% 
  dplyr::summarize(cases_new = mean(cases_new)) %>% 
  data.frame
combined_chart <- plot_ly()
# Predicted Data
combined_chart <- combined_chart %>% 
  add_trace(
    x = df_combined_predicted[["date"]], y = df_combined_predicted[["cases_new"]],
    name = "Future Predicted Data",
    type = 'scatter',
    mode = 'lines',
    line = list(color = 'red', width = 3)
  )
# Test Data
combined_chart <- combined_chart %>% 
  add_trace(
    x = df_smooth[["date"]], y = df_smooth[["cases_new"]],
    name = "Actual Data (Rolled to Monthly)",
    type = 'scatter',
    mode = 'lines',
    line = list(color = 'skyblue', width = 3)
  )
# Set figure title, x and y-axes titles
combined_chart <- combined_chart %>% layout(
  title = "Prediction of Daily New Cases (Gaussian)",
  xaxis = list(title="Recorded Time"),
  yaxis = list(title="Daily Count of New Cases")
)%>%
  layout(plot_bgcolor='#e5ecf6',
          xaxis = list(
            zerolinecolor = '#ffff',
            zerolinewidth = 2,
            gridcolor = 'ffff'),
          yaxis = list(
            zerolinecolor = '#ffff',
            zerolinewidth = 2,
            gridcolor = 'ffff')
          )

combined_chart

Fiddling with Poisson

poisson_model <- glm(cases_new~cases_active, data=data_train, family=poisson(link="log"))
summary(poisson_model)
## 
## Call:
## glm(formula = cases_new ~ cases_active, family = poisson(link = "log"), 
##     data = data_train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -69.27  -50.04  -12.42   23.54   96.57  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  7.264e+00  1.223e-03    5938   <2e-16 ***
## cases_active 1.167e-05  7.085e-09    1647   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 3228235  on 495  degrees of freedom
## Residual deviance:  805402  on 494  degrees of freedom
## AIC: 809428
## 
## Number of Fisher Scoring iterations: 5
# plot(gamma_model)

poisson_prediction <- poisson_model %>% predict(data_test)
poisson_compare <- data.frame(actual=data_test$cases_new, predicted=poisson_prediction)
head(poisson_compare)
##    actual predicted
## 1       4  7.264387
## 5       3  7.264422
## 8       0  7.264434
## 10      0  7.264434
## 14      1  7.264504
## 16      1  7.264504
poisson_performance <- data.frame(
  MODEL = "Poisson GLM",
  R2 = R2(poisson_prediction, data_test$cases_new),
  RMSE = RMSE(poisson_prediction, data_test$cases_new),
  MAE = MAE(poisson_prediction, data_test$cases_new)
)
poisson_performance
##         MODEL        R2    RMSE      MAE
## 1 Poisson GLM 0.9721543 6606.18 3816.115
# Chart init
df_predicted <- data.frame(date=data_test$date, cases_new=poisson_prediction)

poisson_chart <- plot_ly()
# Predicted Data
poisson_chart <- poisson_chart %>% 
  add_trace(
    x = df_predicted[["date"]], y = df_predicted[["cases_new"]],
    name = "Predicted Data",
    type = 'scatter',
    mode = 'lines',
    line = list(color = 'red', width = 3)
  )
# Test Data
poisson_chart <- poisson_chart %>% 
  add_trace(
    x = df_actual[["date"]], y = df_actual[["cases_new"]],
    name = "Actual Data",
    type = 'scatter',
    mode = 'lines',
    line = list(color = 'skyblue', width = 3)
  )

poisson_chart <- poisson_chart %>% 
  add_trace(
    x = df_train[["date"]], y = df_train[["cases_new"]], 
    name = "Train Data",
    type = "scatter",
    mode = "lines",
    line = list(color = 'green', width = 1)
  )

# Set figure title, x and y-axes titles
poisson_chart <- poisson_chart %>% layout(
  title = "Poisson Regression of Daily New Cases",
  xaxis = list(title="Recorded Time"),
  yaxis = list(title="Daily Count of New Cases")
)%>%
  layout(plot_bgcolor='#e5ecf6',
          xaxis = list(
            zerolinecolor = '#ffff',
            zerolinewidth = 2,
            gridcolor = 'ffff'),
          yaxis = list(
            zerolinecolor = '#ffff',
            zerolinewidth = 2,
            gridcolor = 'ffff')
          )


poisson_chart

It seems like the Poisson model is really bad just by the sight of looking at the graph. Therefore do not relies too much on statistical benchmark such as R2 as the source of truth. The easiest way to determine on hindsight is to visualize, use them!

Poisson Prediction

combined_prediction <- poisson_model %>% predict(df_arima)
df_combined_predicted <- data.frame(date=df_arima$date, cases_new=combined_prediction)

combined_chart <- plot_ly()
# Predicted Data
combined_chart <- combined_chart %>% 
  add_trace(
    x = df_combined_predicted[["date"]], y = df_combined_predicted[["cases_new"]],
    name = "Future Predicted Data",
    type = 'scatter',
    mode = 'lines',
    line = list(color = 'red', width = 3)
  )
# Test Data
combined_chart <- combined_chart %>% 
  add_trace(
    x = df_smooth[["date"]], y = df_smooth[["cases_new"]],
    name = "Actual Data (Rolled to Monthly)",
    type = 'scatter',
    mode = 'lines',
    line = list(color = 'skyblue', width = 3)
  )
# Set figure title, x and y-axes titles
combined_chart <- combined_chart %>% layout(
  title = "Prediction of Daily New Cases (Poisson)",
  xaxis = list(title="Recorded Time"),
  yaxis = list(title="Daily Count of New Cases")
)%>%
  layout(plot_bgcolor='#e5ecf6',
          xaxis = list(
            zerolinecolor = '#ffff',
            zerolinewidth = 2,
            gridcolor = 'ffff'),
          yaxis = list(
            zerolinecolor = '#ffff',
            zerolinewidth = 2,
            gridcolor = 'ffff')
          )

combined_chart
poisson_performance
##         MODEL        R2    RMSE      MAE
## 1 Poisson GLM 0.9721543 6606.18 3816.115
linear_performance
##             MODEL        R2     RMSE      MAE
## 1 Gaussian Linear 0.9721543 906.3865 545.7654
combined_performance <- rbind(linear_performance, poisson_performance)
combined_performance 
##             MODEL        R2      RMSE       MAE
## 1 Gaussian Linear 0.9721543  906.3865  545.7654
## 2     Poisson GLM 0.9721543 6606.1804 3816.1151

End of document